In [4]:
import pandas as pd
iris_filename = 'datasets-uci-iris.csv'
iris = pd.read_csv(iris_filename, sep=',', decimal='.', header=None,
names= ['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
'target'])
In [5]:
# If the dataset is not available locally, you can follow these steps to
# download it from the Internet:
try:
import urllib.request as urllib2
except ImportError:
import urllib2
url = "http://aima.cs.berkeley.edu/data/iris.csv"
set1 = urllib2.Request(url)
iris_p = urllib2.urlopen(set1)
iris_other = pd.read_csv(iris_p, sep=',', decimal='.',
header=None, names= ['sepal_length', 'sepal_width',
'petal_length', 'petal_width', 'target'])
iris_other.head()
Out[5]:
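In [ ]:
# As an alternative sketch (assuming the URL above is still reachable),
# pandas can read the CSV straight from the URL, with no explicit
# request object:
iris_from_url = pd.read_csv(url, sep=',', decimal='.', header=None,
                            names=['sepal_length', 'sepal_width',
                                   'petal_length', 'petal_width',
                                   'target'])
iris_from_url.head()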
In [6]:
iris.head()
Out[6]:
In [7]:
iris.tail()
Out[7]:
In [8]:
iris.head(2)
Out[8]:
In [9]:
iris.columns
Out[9]:
In [10]:
Y = iris['target']
Y
Out[10]:
In [11]:
X = iris[['sepal_length', 'sepal_width']]
X
Out[11]:
In [12]:
X.shape
Out[12]:
In [13]:
Y.shape
Out[13]:
In [14]:
import pandas as pd
fake_dataset = pd.read_csv('a_loading_example_1.csv', sep=',')
fake_dataset
Out[14]:
In [15]:
fake_dataset = pd.read_csv('a_loading_example_1.csv',
parse_dates=[0])
fake_dataset
Out[15]:
In [16]:
fake_dataset.fillna(50)
Out[16]:
In [17]:
fake_dataset.fillna(-1)
Out[17]:
In [18]:
fake_dataset.fillna(fake_dataset.mean(axis=0))
Out[18]:
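In [ ]:
# A further imputation sketch: propagate the last valid observation
# forward instead of filling with a constant or the column mean
# (a leading NaN stays missing, as there is nothing to propagate).
fake_dataset.fillna(method='ffill')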
In [19]:
bad_dataset = pd.read_csv('a_loading_example_2.csv',
error_bad_lines=False)
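In [ ]:
# Note: error_bad_lines was deprecated in pandas 1.3 and later removed;
# in recent versions the equivalent spelling is:
# bad_dataset = pd.read_csv('a_loading_example_2.csv',
#                           on_bad_lines='skip')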
In [20]:
import pandas as pd
iris_chunks = pd.read_csv(iris_filename, header=None,
names=['C1', 'C2', 'C3', 'C4', 'C5'], chunksize=10)
for chunk in iris_chunks:
    print chunk.shape
    print chunk
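In [ ]:
# A minimal sketch of why chunking matters: a statistic can be
# accumulated one chunk at a time, without ever holding the whole
# file in memory.
total_rows = 0
for chunk in pd.read_csv(iris_filename, header=None,
                         names=['C1', 'C2', 'C3', 'C4', 'C5'],
                         chunksize=10):
    total_rows += chunk.shape[0]
print total_rows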
In [21]:
iris_iterator = pd.read_csv(iris_filename, header=None,
names=['C1', 'C2', 'C3', 'C4', 'C5'], iterator=True)
In [22]:
print iris_iterator.get_chunk(10).shape
In [23]:
print iris_iterator.get_chunk(20).shape
In [24]:
piece = iris_iterator.get_chunk(2)
piece
Out[24]:
In [25]:
import csv
with open(iris_filename, 'rb') as data_stream:
    for n, row in enumerate(csv.DictReader(data_stream,
            fieldnames=['sepal_length', 'sepal_width',
                        'petal_length', 'petal_width', 'target'],
            dialect='excel')):
        if n == 0:
            print n, row
        else:
            break
In [26]:
with open(iris_filename, 'rb') as data_stream:
    for n, row in enumerate(csv.reader(data_stream,
            dialect='excel')):
        if n == 0:
            print row
        else:
            break
In [27]:
import csv
import numpy as np

def batch_read(filename, batch=5):
    # open the data stream
    with open(filename, 'rb') as data_stream:
        # reset the batch
        batch_output = list()
        # iterate over the file
        for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
            # if the batch is of the right size
            if n > 0 and n % batch == 0:
                # yield back the batch as an ndarray
                yield(np.array(batch_output))
                # reset the batch and restart
                batch_output = list()
            # otherwise add the row to the batch
            batch_output.append(row)
        # when the loop is over, yield what's left
        yield(np.array(batch_output))
In [28]:
import numpy as np
for batch_input in batch_read(iris_filename, batch=3):
    print batch_input
    break
In [29]:
import pandas as pd
my_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':
[1.0]*5, 'Col3': 1.0, 'Col4': 'Hello World!'})
my_own_dataset
Out[29]:
In [30]:
# This raises a ValueError: Col1 (5 values) and Col3 (2 values)
# have incompatible lengths
my_wrong_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':
    'string', 'Col3': range(2)})
In [31]:
my_own_dataset.dtypes
Out[31]:
In [32]:
my_own_dataset['Col1'] = my_own_dataset['Col1'].astype(float)
my_own_dataset.dtypes
Out[32]:
In [33]:
mask_feature = iris['sepal_length'] > 6.0
mask_feature
Out[33]:
In [34]:
mask_target = iris['target'] == 'Iris-virginica'
In [35]:
iris.loc[mask_target, 'target'] = 'New label'
In [36]:
iris['target'].unique()
Out[36]:
In [37]:
grouped_targets_mean = iris.groupby(['target']).mean()
grouped_targets_mean
Out[37]:
In [38]:
grouped_targets_var = iris.groupby(['target']).var()
grouped_targets_var
Out[38]:
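In [ ]:
# Both statistics can also be obtained in a single pass with agg:
iris.groupby(['target']).agg(['mean', 'var'])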
In [39]:
iris.sort_index(by='sepal_length').head()
Out[39]:
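In [ ]:
# Note: sort_index(by=...) has since been removed from pandas;
# the equivalent in recent versions is:
# iris.sort_values(by='sepal_length').head()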
In [40]:
# This is just an example, with no time_series data
# smooth_time_series = pd.rolling_mean(time_series, 5)
In [41]:
# This is just an example, with no time_series data
# median_time_series = pd.rolling_median(time_series, 5)
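In [ ]:
# In recent pandas (>= 0.18) the rolling functions are methods instead:
# smooth_time_series = time_series.rolling(window=5).mean()
# median_time_series = time_series.rolling(window=5).median()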
In [42]:
import pandas as pd
dataset = pd.read_csv('a_selection_example_1.csv')
dataset
Out[42]:
In [43]:
dataset = pd.read_csv('a_selection_example_1.csv', index_col=0)
dataset
Out[43]:
In [44]:
dataset['val3'][104]
Out[44]:
In [45]:
dataset.loc[104, 'val3']
Out[45]:
In [46]:
dataset.ix[104, 'val3']
Out[46]:
In [47]:
dataset.ix[104, 2]
Out[47]:
In [48]:
dataset.iloc[4, 2]
Out[48]:
In [49]:
dataset[['val3', 'val2']][0:2]
Out[49]:
In [50]:
dataset.loc[range(100, 102), ['val3', 'val2']]
Out[50]:
In [51]:
dataset.ix[range(100, 102), ['val3', 'val2']]
Out[51]:
In [52]:
dataset.ix[range(100, 102), [2, 1]]
Out[52]:
In [53]:
dataset.iloc[range(2), [2,1]]
Out[53]:
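In [ ]:
# Note: .ix was deprecated in pandas 0.20 and removed in 1.0; in recent
# versions use .loc for labels and .iloc for positions, as shown above.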
In [54]:
import pandas as pd
categorical_feature = pd.Series(['sunny', 'cloudy', 'snowy',
'rainy', 'foggy'])
mapping = pd.get_dummies(categorical_feature)
mapping
Out[54]:
In [55]:
mapping['sunny']
Out[55]:
In [56]:
mapping['cloudy']
Out[56]:
In [57]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ohe = OneHotEncoder()
levels = ['sunny', 'cloudy', 'snowy', 'rainy', 'foggy']
fit_levs = le.fit_transform(levels)
ohe.fit([[fit_levs[0]], [fit_levs[1]], [fit_levs[2]], [fit_levs[3]],
[fit_levs[4]]])
print ohe.transform([le.transform(['sunny'])]).toarray()
print ohe.transform([le.transform(['cloudy'])]).toarray()
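In [ ]:
# A minimal sketch, assuming scikit-learn >= 0.20, where OneHotEncoder
# accepts string categories directly and the LabelEncoder step is no
# longer needed:
# ohe = OneHotEncoder()
# ohe.fit_transform([['sunny'], ['cloudy'], ['snowy'], ['rainy'],
#                    ['foggy']]).toarray()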
In [60]:
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.med', 'sci.space']
twenty_sci_news = fetch_20newsgroups(categories=categories)
In [59]:
twenty_sci_news.data[0]
In [ ]:
twenty_sci_news.filenames
In [ ]:
print twenty_sci_news.target[0]
print twenty_sci_news.target_names[twenty_sci_news.target[0]]
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
word_count = count_vect.fit_transform(twenty_sci_news.data)
word_count.shape
In [ ]:
print word_count[0]
In [ ]:
word_list = count_vect.get_feature_names()
for n in word_count[0].indices:
    print "Word:", word_list[n], "appears", word_count[0, n], "times"
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vect = TfidfVectorizer(use_idf=False, norm='l1')
word_freq = tf_vect.fit_transform(twenty_sci_news.data)
word_list = tf_vect.get_feature_names()
for n in word_freq[0].indices:
    print "Word:", word_list[n], "has frequency", word_freq[0, n]
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer() # Default: use_idf=True
word_tfidf = tfidf_vect.fit_transform(twenty_sci_news.data)
word_list = tfidf_vect.get_feature_names()
for n in word_tfidf[0].indices:
    print "Word:", word_list[n], "has tfidf", word_tfidf[0, n]
In [ ]:
text_1 = 'we love data science'
text_2 = 'data science is hard'
documents = [text_1, text_2]
documents
In [ ]:
# This is what we saw above: the default, unigram-only vectorizer
count_vect_1_grams = CountVectorizer(ngram_range=(1, 1),
stop_words=[], min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print "Word list = ", word_list
print "text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices]
In [ ]:
# Now a bi-gram count vectorizer
count_vect_2_grams = CountVectorizer(ngram_range=(2, 2))
word_count = count_vect_2_grams.fit_transform(documents)
word_list = count_vect_2_grams.get_feature_names()
print "Word list = ", word_list
print "text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices]
In [ ]:
# Now a uni- and bi-gram count vectorizer
count_vect_1_2_grams = CountVectorizer(ngram_range=(1, 2))
word_count = count_vect_1_2_grams.fit_transform(documents)
word_list = count_vect_1_2_grams.get_feature_names()
print "Word list = ", word_list
print "text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices]
In [ ]:
from sklearn.feature_extraction.text import HashingVectorizer
hash_vect = HashingVectorizer(n_features=1000)
word_hashed = hash_vect.fit_transform(twenty_sci_news.data)
word_hashed.shape
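In [ ]:
# Since hashing is stateless, the same vectorizer can transform unseen
# text without any extra fitting step:
hash_vect.transform(['a brand new document']).shape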
In [ ]:
import numpy as np
# Transform a list into a uni-dimensional array
list_of_ints = [1,2,3]
Array_1 = np.array(list_of_ints)
Array_1
In [ ]:
Array_1[1] # let's output the second value
In [ ]:
type(Array_1)
In [ ]:
Array_1.dtype # Note: the default dtype depends on the system you are running on.
In [ ]:
import numpy as np
Array_1.nbytes # Please note that on 64bit platforms the result will be 24.
In [ ]:
Array_1 = np.array(list_of_ints, dtype= 'int8')
In [ ]:
Array_1b = Array_1.astype('float32')
Array_1b
In [ ]:
import numpy as np
complex_list = [1,2,3] + [1.,2.,3.] + ['a','b','c']
Array_2 = np.array(complex_list[:3]) # at first the input list is just ints
print 'complex_list[:3]', Array_2.dtype
Array_2 = np.array(complex_list[:6]) # then it is ints and floats
print 'complex_list[:6]', Array_2.dtype
Array_2 = np.array(complex_list) # finally we add strings
print 'complex_list[:] ',Array_2.dtype
In [ ]:
# Check if a NumPy array is of the desired numeric type
print isinstance(Array_2[0],np.number)
In [ ]:
import numpy as np
# Transform a list into a bidimensional array
a_list_of_lists = [[1,2,3],[4,5,6],[7,8,9]]
Array_2D = np.array(a_list_of_lists)
Array_2D
In [ ]:
Array_2D[1,1]
In [ ]:
# Transform a list into a multi-dimensional array
a_list_of_lists_of_lists = [[[1,2],[3,4],[5,6]],
[[7,8],[9,10],[11,12]]]
Array_3D = np.array(a_list_of_lists_of_lists)
Array_3D
In [ ]:
Array_3D[0,2,0] # Accessing the 5th element
In [ ]:
np.array({1:2,3:4,5:6}.items())
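In [ ]:
# Note for Python 3, where .items() returns a view rather than a list:
# np.array(list({1:2, 3:4, 5:6}.items()))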
In [ ]:
import numpy as np
# Restructuring a NumPy array shape
original_array = np.array([1, 2, 3, 4, 5, 6, 7, 8])
Array_a = original_array.reshape(4,2)
Array_b = original_array.reshape(4,2).copy()
Array_c = original_array.reshape(2,2,2)
# Attention: reshape returns views, not copies, whenever possible
original_array[0] = -1
In [ ]:
Array_a
In [ ]:
Array_c
In [ ]:
Array_b
In [ ]:
original_array.resize(4,2)
original_array
In [ ]:
original_array.shape = (4,2)
In [ ]:
original_array
In [ ]:
import numpy as np
ordinal_values = np.arange(9).reshape(3,3)
ordinal_values
In [ ]:
np.arange(9)[::-1]
In [ ]:
np.random.randint(low=1, high=10, size=(3,3))
In [ ]:
np.zeros((3,3))
In [ ]:
np.ones((3,3))
In [ ]:
np.eye(3)
In [ ]:
fractions = np.linspace(start=0, stop=1, num=10)
fractions
In [ ]:
growth = np.logspace(start=0, stop=1, num=10, base=10.0)
growth
In [ ]:
std_gaussian = np.random.normal(size=(3,3))
std_gaussian
In [ ]:
gaussian = np.random.normal(loc=1.0, scale= 3.0, size=(3,3))
gaussian
In [ ]:
np.random.uniform(low=0.0, high=1.0, size=(3,3))
In [ ]:
import numpy as np
housing = np.loadtxt('regression-datasets-housing.csv',delimiter=',', dtype=float)
In [ ]:
# This raises a ValueError: the last column of the Iris dataset contains
# string labels, which cannot be parsed as float
np.loadtxt('datasets-uci-iris.csv', delimiter=',', dtype=float)
In [ ]:
import pandas as pd
import numpy as np
housing_filename = 'regression-datasets-housing.csv'
housing = pd.read_csv(housing_filename, header=None)
In [ ]:
housing_array = housing.values
housing_array.dtype
In [ ]:
housing.dtypes
In [ ]:
import numpy as np
a = np.arange(5).reshape(1,5)
a += 1
a*a
In [ ]:
a = np.arange(5).reshape(1,5) + 1
b = np.arange(5).reshape(5,1) + 1
a * b
In [ ]:
a2 = np.array([1,2,3,4,5] * 5).reshape(5,5)
b2 = a2.T
a2 * b2
In [ ]:
print a2
In [ ]:
np.sum(a2, axis=0)
In [ ]:
np.sum(a2, axis=1)
In [ ]:
%timeit -n 1 -r 3 [i+1.0 for i in range(10**6)]
%timeit -n 1 -r 3 np.arange(10**6)+1.0
In [ ]:
import math
%timeit -n 1 -r 3 [math.sqrt(i) for i in range(10**6)]
In [ ]:
%timeit -n 1 -r 3 np.sqrt(np.arange(10**6))
In [ ]:
import numpy as np
M = np.arange(5*5, dtype=float).reshape(5,5)
M
In [ ]:
coefs = np.array([1., 0.5, 0.5, 0.5, 0.5])
coefs_matrix = np.column_stack((coefs,coefs[::-1]))
print coefs_matrix
In [ ]:
np.dot(M,coefs)
In [ ]:
np.dot(coefs,M)
In [ ]:
np.dot(M,coefs_matrix)
In [ ]:
import numpy as np
M = np.arange(10*10, dtype=int).reshape(10,10)
In [ ]:
M[2:9:2,:]
In [ ]:
M[2:9:2,5:]
In [ ]:
M[2:9:2,5::-1]
In [ ]:
# In the book, the output of this cell is wrong;
# the correct output is reported here.
row_index = (M[:,0]>=20) & (M[:,0]<=80)
col_index = M[0,:]>=5
M[row_index,:][:,col_index]
In [ ]:
mask = (M>=20) & (M<=90) & ((M / 10.) % 1 >= 0.5)
M[mask]
In [ ]:
row_index = [1,1,2,7]
col_index = [0,2,4,8]
In [ ]:
M[row_index,col_index]
In [ ]:
M[row_index,:][:,col_index]
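In [ ]:
# A minimal alternative sketch: np.ix_ builds the same cross-product
# selection as the double indexing above, in a single step.
M[np.ix_(row_index, col_index)]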
In [ ]:
N = M[2:9:2,5:].copy()
In [ ]:
import numpy as np
dataset = np.arange(10*5).reshape(10,5)
In [ ]:
single_line = np.arange(1*5).reshape(1,5)
a_few_lines = np.arange(3*5).reshape(3,5)
In [ ]:
np.vstack((dataset,single_line))
In [ ]:
np.vstack((dataset,a_few_lines))
In [ ]:
np.vstack((dataset,single_line,single_line))
In [ ]:
bias = np.ones(10).reshape(10,1)
np.hstack((dataset,bias))
In [ ]:
bias = np.ones(10)
np.column_stack((dataset,bias))
In [ ]:
np.dstack((dataset*1,dataset*2,dataset*3))
In [ ]:
np.insert(dataset, 3, bias, axis=1)
In [ ]:
np.insert(dataset, 3, dataset.T, axis=1)
In [ ]:
np.insert(dataset, 3, np.ones(5), axis=0)